// Thread-group dimensions used by the [numthreads] attribute on CSMain.
// Both can be supplied as compile-time macro definitions. An undefined macro
// evaluates to 0 inside #if, so the defaults below also apply when a macro is
// undefined or explicitly defined as 0.
#if !BLOCK_X
    #define BLOCK_X 16
#endif
// BLOCK_Y falls back to BLOCK_X, which yields a square thread group.
#if !BLOCK_Y
    #define BLOCK_Y BLOCK_X
#endif

// Shader constants bound at register b0.
cbuffer CB : register(b0)
{
    // Exclusive upper bound on the x coordinate processed by CSMain;
    // threads with DTid.x >= SrcWidth exit without reading or writing.
    int SrcWidth;
    // Exclusive upper bound on the y coordinate processed by CSMain;
    // threads with DTid.y >= SrcHeight exit without reading or writing.
    int SrcHeight;
    // Not referenced by the shader code visible in this file.
    // NOTE(review): presumably the source row pitch - confirm against the host code.
    int SrcPitch;
};

// Output plane receiving one Y value per processed pixel (written at DTid.xy).
RWTexture2D<float>  outputTextureY : register(u0);
// Output plane receiving one (U, V) pair per 2x2 pixel block (written at DTid.xy / 2
// by the thread whose x and y coordinates are both even).
RWTexture2D<float2> outputTextureUV : register(u1);
// Source image; CSMain reads the .xyz channels of the texel at DTid.xy.
Texture2D<float4>   inputTexture : register(t0);

// Converts an RGB color to limited-range BT.709 Y'CbCr.
//
// rgb     : red, green and blue in .x, .y and .z. The output clamp assumes the
//           channels are normalized to [0, 1].
// returns : float3(y, u, v), each component clamped to [0, 1].
//
// The matrix is built from the BT.709 luma weights Kr = 0.2126, Kg = 0.7152,
// Kb = 0.0722:
//   - the luma row is scaled by 219/255 and offset by 16/255;
//   - the chroma rows are scaled by 224/255 and offset by 128/255.
// The luma row therefore sums to 219/255 and each chroma row sums to zero, so
// grey input (r == g == b) yields neutral chroma of exactly 128/255. Mixing the
// 219 and 224 scale factors within a row breaks that property and tints greys.
float3 funcRGBAToNV12BT709CSC(float3 rgb)
{
    // Rows of the RGB -> Y'CbCr matrix; each row is applied as a dot product.
    static const float3 kLumaRow    = float3( 0.182586f,  0.614231f,  0.062007f);
    static const float3 kChromaURow = float3(-0.100644f, -0.338572f,  0.439216f);
    static const float3 kChromaVRow = float3( 0.439216f, -0.398942f, -0.040274f);

    // Limited-range offsets: 16 for luma, 128 for chroma (in 8-bit code values).
    static const float kLumaOffset   = 16.0f / 255.0f;
    static const float kChromaOffset = 128.0f / 255.0f;

    float3 yuv = float3(dot(rgb, kLumaRow) + kLumaOffset,
                        dot(rgb, kChromaURow) + kChromaOffset,
                        dot(rgb, kChromaVRow) + kChromaOffset);

    // saturate() clamps every component to [0, 1].
    return saturate(yuv);
}

// Compute-shader entry point: one thread per source pixel.
//
// Each thread whose coordinate lies inside SrcWidth x SrcHeight reads the
// .xyz channels of its texel, converts them with funcRGBAToNV12BT709CSC and
// writes the luma value to outputTextureY at the same coordinate. The thread
// at the even/even corner of each 2x2 block also writes its own (u, v) pair to
// outputTextureUV at half the coordinate; the other three pixels of the block
// do not contribute to the chroma sample.
//
// Gid, GTid and GI are part of the entry-point signature but are not used.
[numthreads(BLOCK_X, BLOCK_Y, 1)]
void CSMain(uint3 Gid : SV_GroupID,
            uint3 DTid : SV_DispatchThreadID,
            uint3 GTid : SV_GroupThreadID,
            uint  GI : SV_GroupIndex)
{
    const uint2 pixel = DTid.xy;

    // Threads dispatched past the image edge have nothing to do.
    const bool insideSource = (pixel.x < SrcWidth) && (pixel.y < SrcHeight);
    if (insideSource)
    {
        const float3 srcColor  = inputTexture[pixel].xyz;
        const float3 converted = funcRGBAToNV12BT709CSC(srcColor);

        outputTextureY[pixel] = converted.x;

        // Both coordinates are even exactly when neither has its low bit set.
        const bool isBlockOrigin = ((pixel.x | pixel.y) & 1u) == 0u;
        if (isBlockOrigin)
        {
            outputTextureUV[pixel / 2u] = converted.yz;
        }
    }
}